## Read Nielsen ads, filter to DMAs with overlap with STB data and dates in 9/1 - 11/6

library(data.table)
library(lubridate)
library(tidyverse)
library(hms)

### SET WORKING DIRECTORY HERE 
path_to_archive <- "replication"
setwd(path_to_archive)

# by ad sponsor type
load("data/dma_timezone.RData")
dma_timezone <- dma_timezone %>% 
	select(dma_code, timezone) %>%
	mutate(timezone = recode(timezone, ETZ = "America/New_York", CTZ = "America/Chicago", PTZ="America/Los_Angeles")) %>%
	filter(!is.na(dma_code))

### NOTE: THESE FILES ARE PROPRIETARY NIELSEN DATA (NOT INCLUDED IN ARCHIVE)
### data must be licensed through Nielsen first; contact Ethan Markowitz (ethan.v.markovitz@nielsen.com) for details on licensing
load("data/sengov_ads_2010_2014.RData")
load("data/congressional_ads_2010_2014.RData")
load("data/pac_ads_2010_2014.RData")
load("data/pres_ads_2012.RData")
### END PROPRIETARY DATA

ads_dma <- read_csv("data/dmacodes.csv") %>% 
  mutate(Market = str_trim(substr(Market,1,15)))

ads <- bind_rows(congress.ads %>% mutate(ad_type = "cand_house"), 
				sengov.ads %>% mutate(ad_type = case_when(office.x %in% c("gov", "ltgov", "oth") ~ "cand_statewide", office.x %in% c("sen", "sen2") ~ "cand_senate")), 
				group.ads %>% mutate(ad_type = "outside"), 
				pres.ads %>% mutate(ad_type = "cand_pres")) %>% 
  mutate(Market = str_trim(substr(Market,1,15))) %>%
  left_join(ads_dma) %>%
  mutate(dma_code=as.character(DMA)) %>%
  inner_join(dma_timezone) %>%  ## restricts to ads in STB markets
  mutate(
  	channel = sub(" *([DKMNWX][A-Z]+) \\(([A-Z]+)\\) *", "\\1", Distributor),
  	affiliate = sub(" *([DKMNWX][A-Z]+) \\(([A-Z]+)\\) *", "\\2", Distributor),
  	program = str_trim(Program.Title)
  	)

# select dates between 9/1 and 11/6
ads <- ads %>% 
  mutate(Date = mdy(Date), Time = as.hms(Time)) %>%
  filter(Date>=mdy("09/01/2012"), Date<=mdy("11/06/2012")) %>%
  mutate(ad_time_utc = case_when(timezone == "America/New_York" ~ ymd_hms(paste(as.character(Date), as.character(Time), sep=" "), tz="America/New_York"),
							  timezone == "America/Chicago" ~ ymd_hms(paste(as.character(Date), as.character(Time), sep=" "), tz="America/Chicago"),
							  timezone == "America/Los_Angeles" ~ ymd_hms(paste(as.character(Date), as.character(Time), sep=" "), tz="America/Los_Angeles"),
							  TRUE ~ ymd_hms(paste(as.character(Date), as.character(Time), sep=" "), tz="America/New_York")) %>%
								with_tz(tzone="UTC")) %>%
  as.data.table


ads[, c("X...", "GRP..P18.", "IMP.000...P18.", "CPM..P18.", "Brand", "Product.Category") := NULL] # remove proprietary cost / impressions cols
ads %>% saveRDS("data/final_ads.rds")